This tutorial is a replicate of kaggle kernel writen by Heads or Tails. The goal of this EDA is to predict the statistics of visotors in two touristics areas in Japon. The challenge was published by kaggle.
#library(readr)
library(tibble)
library('stringr') # string manipulation
library('data.table') # data manipulation
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
library(ggplot2)
library(grid)
library(leaflet)
library(tidyr)
library(ggExtra)
library(ggridges)##
## Attaching package: 'ggridges'
## The following object is masked from 'package:ggplot2':
##
## scale_discrete_manual
air_visit <- as_tibble(fread(str_c("air_visit_data.csv")))
air_reserve <- as_tibble(fread(str_c('air_reserve.csv')))
air_store_info <- as_tibble(fread(str_c('air_store_info.csv')))
date_info <- as_tibble(fread(str_c('date_info.csv')))
hpg_reserve <- as_tibble(fread(str_c('hpg_reserve.csv')))
hpg_store_info <- as_tibble((fread(str_c('hpg_store_info.csv'))))
sample_submission <- as_tibble(fread(str_c('sample_submission.csv')))
store_id_relation <- as_tibble(fread(str_c('store_id_relation.csv')))# Define multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}summary(air_visit)## air_store_id visit_date visitors
## Length:252108 Length:252108 Min. : 1.00
## Class :character Class :character 1st Qu.: 9.00
## Mode :character Mode :character Median : 17.00
## Mean : 20.97
## 3rd Qu.: 29.00
## Max. :877.00
tibble::glimpse(air_visit)## Observations: 252,108
## Variables: 3
## $ air_store_id <chr> "air_ba937bf13d40fb24", "air_ba937bf13d40fb24", "ai…
## $ visit_date <chr> "2016-01-13", "2016-01-14", "2016-01-15", "2016-01-…
## $ visitors <int> 25, 32, 29, 22, 6, 9, 31, 21, 18, 26, 21, 11, 24, 2…
The tibble shows the number of visitors by store for each date.
air_visit %>% dplyr::distinct(air_store_id) %>% nrow()## [1] 829
air_visit %>% dplyr::distinct(visit_date) %>% nrow()## [1] 478
There is 829 different store and 478 dates
summary(air_reserve)## air_store_id visit_datetime reserve_datetime
## Length:92378 Length:92378 Length:92378
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## reserve_visitors
## Min. : 1.000
## 1st Qu.: 2.000
## Median : 3.000
## Mean : 4.482
## 3rd Qu.: 5.000
## Max. :100.000
glimpse(air_reserve)## Observations: 92,378
## Variables: 4
## $ air_store_id <chr> "air_877f79706adbfb06", "air_db4b38ebe7a7ceff",…
## $ visit_datetime <chr> "2016-01-01 19:00:00", "2016-01-01 19:00:00", "…
## $ reserve_datetime <chr> "2016-01-01 16:00:00", "2016-01-01 19:00:00", "…
## $ reserve_visitors <int> 1, 3, 6, 2, 5, 2, 4, 2, 2, 2, 3, 3, 2, 6, 7, 41…
The number of reservation is 92378, distributed for 314 air store.
air_reserve %>% dplyr::distinct(air_store_id) %>% nrow()## [1] 314
summary(hpg_reserve)## hpg_store_id visit_datetime reserve_datetime
## Length:2000320 Length:2000320 Length:2000320
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## reserve_visitors
## Min. : 1.000
## 1st Qu.: 2.000
## Median : 3.000
## Mean : 5.074
## 3rd Qu.: 6.000
## Max. :100.000
tibble::glimpse(hpg_reserve)## Observations: 2,000,320
## Variables: 4
## $ hpg_store_id <chr> "hpg_c63f6f42e088e50f", "hpg_dac72789163a3f47",…
## $ visit_datetime <chr> "2016-01-01 11:00:00", "2016-01-01 13:00:00", "…
## $ reserve_datetime <chr> "2016-01-01 09:00:00", "2016-01-01 06:00:00", "…
## $ reserve_visitors <int> 1, 3, 2, 5, 13, 2, 2, 2, 2, 6, 2, 2, 2, 2, 5, 4…
There are 2000320 revervation for 13325 stores
hpg_reserve %>% dplyr::distinct(hpg_store_id) %>% nrow()## [1] 13325
summary(air_store_info)## air_store_id air_genre_name air_area_name latitude
## Length:829 Length:829 Length:829 Min. :33.21
## Class :character Class :character Class :character 1st Qu.:34.70
## Mode :character Mode :character Mode :character Median :35.66
## Mean :35.65
## 3rd Qu.:35.69
## Max. :44.02
## longitude
## Min. :130.2
## 1st Qu.:135.3
## Median :139.7
## Mean :137.4
## 3rd Qu.:139.8
## Max. :144.3
tibble::glimpse(air_store_info)## Observations: 829
## Variables: 5
## $ air_store_id <chr> "air_0f0cdeee6c9bf3d7", "air_7cc17a324ae5c7dc", "…
## $ air_genre_name <chr> "Italian/French", "Italian/French", "Italian/Fren…
## $ air_area_name <chr> "Hyōgo-ken Kōbe-shi Kumoidōri", "Hyōgo-ken Kōbe-s…
## $ latitude <dbl> 34.69512, 34.69512, 34.69512, 34.69512, 35.65807,…
## $ longitude <dbl> 135.1979, 135.1979, 135.1979, 135.1979, 139.7516,…
The number of air area
air_store_info %>% dplyr::distinct(air_area_name) %>% nrow()## [1] 103
The number of stores
air_store_info %>% dplyr::distinct(air_store_id) %>% nrow()## [1] 829
The number of latitude
air_store_info %>% dplyr::distinct(latitude) %>% nrow()## [1] 108
The number of longitude
air_store_info %>% dplyr::distinct(longitude) %>% nrow()## [1] 108
the number of cuisine genre in air
air_store_info %>% dplyr::distinct(air_genre_name) %>% nrow()## [1] 14
summary(hpg_store_info)## hpg_store_id hpg_genre_name hpg_area_name latitude
## Length:4690 Length:4690 Length:4690 Min. :33.31
## Class :character Class :character Class :character 1st Qu.:34.69
## Mode :character Mode :character Mode :character Median :35.66
## Mean :35.81
## 3rd Qu.:35.70
## Max. :43.77
## longitude
## Min. :130.3
## 1st Qu.:135.5
## Median :139.5
## Mean :137.7
## 3rd Qu.:139.7
## Max. :143.7
tibble::glimpse(hpg_store_info)## Observations: 4,690
## Variables: 5
## $ hpg_store_id <chr> "hpg_6622b62385aec8bf", "hpg_e9e068dd49c5fa00", "…
## $ hpg_genre_name <chr> "Japanese style", "Japanese style", "Japanese sty…
## $ hpg_area_name <chr> "Tōkyō-to Setagaya-ku Taishidō", "Tōkyō-to Setaga…
## $ latitude <dbl> 35.64367, 35.64367, 35.64367, 35.64367, 35.64367,…
## $ longitude <dbl> 139.6682, 139.6682, 139.6682, 139.6682, 139.6682,…
There are 4690 stores
hpg_store_info %>% dplyr::distinct(latitude) %>% nrow()## [1] 129
There are 119 area name
hpg_store_info %>% dplyr::distinct(hpg_area_name) %>% nrow()## [1] 119
There are 34 genres of cuisine in HPG
hpg_store_info %>% dplyr::distinct(hpg_genre_name) %>% nrow()## [1] 34
glimpse(date_info)## Observations: 517
## Variables: 3
## $ calendar_date <chr> "2016-01-01", "2016-01-02", "2016-01-03", "2016-01…
## $ day_of_week <chr> "Friday", "Saturday", "Sunday", "Monday", "Tuesday…
## $ holiday_flg <int> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
glimpse(store_id_relation)## Observations: 150
## Variables: 2
## $ air_store_id <chr> "air_63b13c56b7201bd9", "air_a24bf50c3e90d583", "ai…
## $ hpg_store_id <chr> "hpg_4bc649e72e2a239a", "hpg_c34b496d0305a809", "hp…
There are 150 relation between Air (829 stores) and HPG (4690 stores) stores.
tibble::glimpse(sample_submission)## Observations: 32,019
## Variables: 2
## $ id <chr> "air_00a91d42b08b08d9_2017-04-23", "air_00a91d42b08b08d…
## $ visitors <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
the submision file should be two columns. The first one is the cancatenation of store_id and date. the 2sd is the number of visitors.
purrr::map(ls(), is.na)## [[1]]
## [1] FALSE
##
## [[2]]
## [1] FALSE
##
## [[3]]
## [1] FALSE
##
## [[4]]
## [1] FALSE
##
## [[5]]
## [1] FALSE
##
## [[6]]
## [1] FALSE
##
## [[7]]
## [1] FALSE
##
## [[8]]
## [1] FALSE
##
## [[9]]
## [1] FALSE
air_visit <- air_visit %>% mutate(visit_date = ymd(visit_date))
air_reserve <- air_reserve %>% mutate(visit_datetime = ymd_hms(visit_datetime),
reserve_datetime = ymd_hms(reserve_datetime))
hpg_reserve <- hpg_reserve %>% mutate(visit_datetime = ymd_hms(visit_datetime),
reserve_datetime = ymd_hms(reserve_datetime))
air_store_info <- air_store_info %>% mutate(air_genre_name = as.factor(air_genre_name),
air_area_name = as.factor(air_area_name))
hpg_store_info <- hpg_store_info %>% mutate(hpg_genre_name = as.factor(hpg_genre_name),
hpg_area_name = as.factor(hpg_area_name))
date_info <- date_info %>% mutate(holiday_flg = as.logical(holiday_flg),
date = ymd(calendar_date))Here we have a first look at the distribution of the feature in our individual data files before combining them for a more detailed analysis.
p1 <- air_visit %>%
group_by(visit_date) %>%
summarise(all_visitors = sum(visitors)) %>%
ggplot(aes(visit_date, all_visitors)) +
geom_line(col = "blue") +
labs(y = "All visitors", x = "Date")
p2 <- air_visit %>%
ggplot(aes(visitors)) +
geom_vline(xintercept = 20, color ="red") +
geom_histogram(fill = "blue", bins = 30) +
#xlim(0,150)
scale_x_log10()
p3 <- air_visit %>%
mutate(wday = wday(visit_date, label = TRUE)) %>%
group_by(wday) %>%
summarise(visits = median(visitors)) %>%
ggplot(aes(wday, visits, fill = wday)) +
geom_col() +
theme(legend.position = "none") +
labs(x = "Day of the week", y = "Median Visitors")
p4 <- air_visit %>%
mutate(Month = month(visit_date, label = TRUE)) %>%
group_by(Month) %>%
summarise(visits = median(visitors)) %>%
ggplot(aes(Month, visits, fill = Month)) +
geom_col() +
theme(legend.position = 'none') +
labs(x = "MOnth", y = "Median Visitors")
layout <- matrix(c(1,1,1,1,2,3,4,4), 2, 4, byrow = TRUE)
multiplot(p1, p2, p3, p4, layout=layout)We will be forecasting for the last week of April plus May 2017, so let’s look at this time range in our 2016 training data:
air_visit %>%
filter(visit_date > ymd("2016-04-15") & visit_date < ymd("2016-06-15")) %>%
group_by(visit_date) %>%
summarise(all_visitors = sum(visitors)) %>%
ggplot(aes(visit_date, all_visitors)) +
geom_line() +
geom_smooth(method = "loess", color = "blue", span = 1/10) +
labs(y = "All visitors", x = "Date")foo <- air_reserve %>%
mutate(reserve_date = date(reserve_datetime),
reserve_hour = hour(reserve_datetime),
reserve_wday = wday(reserve_datetime, label = TRUE),
visit_date = date(visit_datetime),
visit_hour = hour(visit_datetime),
visit_wday = wday(visit_datetime, label = TRUE),
diff_hour = time_length(visit_datetime - reserve_datetime, unit = "hour"),
diff_day = time_length(visit_datetime - reserve_datetime, unit = "day"))
p1 <- foo %>%
group_by(visit_date) %>%
summarise(all_vitisors = sum(reserve_visitors)) %>%
ggplot(aes(visit_date, all_vitisors)) +
geom_line() +
labs(x = "Air visit date")
p2 <- foo %>%
group_by(visit_hour) %>%
summarise(all_visitors = sum(reserve_visitors)) %>%
ggplot(aes(visit_hour, all_visitors)) +
geom_col(fill = "blue") +
labs(x = "Time from reservation to visit [hours]")
p3 <- foo %>%
filter(diff_hour < 24 *7) %>%
group_by(diff_hour) %>%
summarise(all_visitors = sum(reserve_visitors)) %>%
ggplot(aes(diff_hour, all_visitors)) +
geom_col(fill= "blue") +
labs(x = "Time from reservation (hours)", y = "all_visitors")
layout <- matrix(c(1, 1, 2, 3), 2, 2, byrow = TRUE)
multiplot(p1, p2, p3, layout = layout)too <- hpg_reserve %>%
mutate(visit_date = date(visit_datetime),
visit_hour = hour(visit_datetime),
visit_wday = wday(visit_datetime),
reserve_date = date(reserve_datetime),
reserve_hour = hour(reserve_datetime),
reserve_wday = wday(reserve_datetime),
diff_hour = time_length(visit_datetime - reserve_datetime, unit = "hour"),
diff_day = time_length(visit_datetime - reserve_datetime, unit = "day")
)
p1 <- too %>%
group_by(visit_date) %>%
summarise(all_visitors = sum(reserve_visitors)) %>%
ggplot(aes(visit_date, all_visitors)) +
geom_line()
p2 <- too %>%
group_by(visit_hour) %>%
summarise(all_visitors = sum(reserve_visitors)) %>%
ggplot(aes(visit_hour, all_visitors)) +
geom_col(fill="red")
p3 <- too %>%
group_by(diff_hour) %>%
summarise(all_visitors = sum(reserve_visitors)) %>%
ggplot(aes(diff_hour, all_visitors)) +
geom_col(fill = "red") +
xlim(0,150)
layout <- matrix(c(1,1,2,3), 2, 2, byrow = TRUE)
multiplot(p1, p2, p3, layout = layout)## Warning: Removed 1978 rows containing missing values (position_stack).
## Warning: Removed 2 rows containing missing values (geom_col).
leaflet(air_store_info) %>%
addTiles() %>%
addProviderTiles("CartoDB.Positron") %>%
addMarkers(~longitude, ~latitude,
popup = ~air_store_id, label = ~air_genre_name,
clusterOptions = markerClusterOptions()
)p1 <- air_store_info %>%
group_by(air_area_name) %>%
count() %>%
ungroup() %>%
top_n(15,n) %>%
ggplot(aes(reorder(air_area_name, n, FUN = min), n, fill = air_area_name )) +
geom_col()+
coord_flip()+
theme(legend.position = "none") +
labs(x= "Top 15 area (air_aera_name)", y ="Number of air restaurants")
p2 <- air_store_info %>%
group_by(air_genre_name) %>%
count() %>%
#ungroup() %>%
top_n(15, n) %>%
ggplot(aes(reorder(air_genre_name, n), n, fill = air_genre_name)) +
geom_col() +
coord_flip()+
theme(legend.position = "none")
layout <- matrix(c(1,2), 2,1, byrow = TRUE)
multiplot(p1,p2, layout = layout)leaflet(hpg_store_info) %>%
addTiles() %>%
addProviderTiles("cartoDB.positron") %>%
addMarkers(~longitude, ~latitude,
popup = ~hpg_store_id, label = ~hpg_genre_name,
clusterOptions = markerClusterOptions()
)p1 <- hpg_store_info %>%
group_by(hpg_genre_name) %>%
count() %>%
ggplot(aes(reorder(hpg_genre_name, n, FUN = min), n, fill = hpg_genre_name)) +
geom_col() +
coord_flip() +
theme(legend.position = "none") +
labs(x = "Type of cuisine (hpg_genre_name)", y = "Number of hpg restaurants")
p2 <- hpg_store_info %>%
group_by(hpg_area_name) %>%
count() %>%
arrange(desc(n)) %>%
head(15) %>%
ggplot(aes(reorder(x = hpg_area_name, n, FUN = min), n, fill = hpg_area_name)) +
geom_col() +
coord_flip() +
theme(legend.position = "none") +
labs(x = "Top 15 areas (hpg_area_name)", y = "Number of hpg restaurants")
layout <- matrix(c(1,2), 1,2, byrow=TRUE)
multiplot(p1, p2, layout = layout)date_info %>%
group_by(holiday_flg) %>%
count() %>%
ggplot(aes(x = holiday_flg, y = n, fill = holiday_flg)) +
geom_col() +
theme(legend.position = "none")date_info %>%
filter(date > ymd("2017-04-15") & date < ymd("2017-06-01")) %>%
ggplot(aes(x = date, y= holiday_flg, color = holiday_flg)) +
geom_point() +
theme(legend.position = "none")air_visit_train <- air_visit %>%
rename(date = visit_date) %>%
distinct(date) %>%
mutate(dset= "train")
bar <- sample_submission %>%
tidyr::separate(id, c("foo","bar", "date"), sep = "_") %>%
mutate(date = ymd(date)) %>%
distinct(date) %>%
mutate(dset = "test")
air_visit_train <- air_visit_train %>%
bind_rows(bar)
year(air_visit_train$date) <- 2017
air_visit_train %>%
filter(!is.na(date)) ## # A tibble: 516 x 2
## date dset
## <date> <chr>
## 1 2017-01-13 train
## 2 2017-01-14 train
## 3 2017-01-15 train
## 4 2017-01-16 train
## 5 2017-01-18 train
## 6 2017-01-19 train
## 7 2017-01-20 train
## 8 2017-01-21 train
## 9 2017-01-22 train
## 10 2017-01-23 train
## # … with 506 more rows
#mutate(year = forcats::fct_relevel(as.factor(year), c("2017", "2016")))air_visit %>%
left_join(air_store_info, by = "air_store_id") %>%
group_by(visit_date, air_genre_name) %>%
summarise(mean_visitors = mean(visitors)) %>%
ungroup() %>%
ggplot(aes(visit_date, mean_visitors, color = air_genre_name)) +
geom_line() +
labs(y = "Average number of visitors to 'air' restaurants", x = "Date") +
theme(legend.position = "none") +
scale_y_log10() +
facet_wrap(~ air_genre_name)air_visit %>%
left_join(air_store_info, by = "air_store_id") %>%
mutate(wday = wday(visit_date, label = TRUE)) %>%
group_by(wday, air_genre_name) %>%
summarise(mean_visitors = mean(visitors)) %>%
ggplot(aes(y = mean_visitors, x= air_genre_name, color = wday)) +
geom_point() +
theme(legend.position = "left", axis.text.y = element_blank(),
plot.title = element_text(size = 14)) +
coord_flip() +
scale_color_hue()air_visit %>%
left_join(air_store_info, by = "air_store_id") %>%
group_by(air_genre_name) %>%
ggplot(aes(x = visitors, y = air_genre_name)) +
ggridges::geom_density_ridges(bandwidth = 0.1) +
scale_x_log10() +
ggridges::scale_fill_cyclical(values = c("blue", "red"))foo <- air_visit %>%
#group_by(visit_date) %>%
mutate(calendar_date = as.character(visit_date)) %>%
left_join(date_info, by="calendar_date")
foo %>%
ggplot(aes(holiday_flg, visitors, color = holiday_flg)) +
geom_boxplot() +
scale_y_log10() +
theme(legend.position = "none")foo %>%
mutate(wday = wday(date, label = TRUE)) %>%
group_by(wday, holiday_flg) %>%
summarise(visitors_mean = mean(visitors)) %>%
ggplot(aes(x= wday, visitors_mean, color = holiday_flg)) +
geom_point()air_store_info %>%
#mutate(air_area_name = str_sub(air_area_name, 1, 12))
group_by(air_area_name) %>%
count(air_genre_name) %>%
head(50) %>%
ggplot(aes(x = air_area_name, y = air_genre_name)) +
geom_point(aes(size = n)) +
theme(axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9))hpg_store_info %>%
mutate(area = str_sub(hpg_area_name, 1, 10)) %>%
group_by(area) %>%
count(hpg_genre_name) %>%
head(100) %>%
ggplot(aes(x= area, y = hpg_genre_name )) +
geom_point(aes(size = n)) +
theme(legend.position = "bottom",axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9))air_store_info %>%
group_by(air_genre_name) %>%
count(air_area_name) %>%
ggplot(aes(reorder(x= air_genre_name, n, FUN = mean ), y = n)) +
scale_y_log10() +
geom_boxplot() +
geom_jitter(color ="blue") +
coord_flip()foo <- air_visit %>%
left_join(air_store_info, by = "air_store_id")
bar <- air_store_info %>%
group_by(air_genre_name, air_area_name) %>%
count()
foobar <- hpg_store_info %>%
group_by(hpg_genre_name, hpg_area_name) %>%
count()
p1 <- bar %>%
ggplot(aes(n)) +
geom_histogram(fill = "blue", binwidth = 1) +
labs(x = "Air genres per aera")
p2 <- foobar %>%
ggplot(aes(n)) +
geom_histogram(fill = "red", binwidth = 1) +
labs(x = "HPG genres per area")
P3 <- foo %>%
group_by(air_genre_name, air_area_name) %>%
summarise(mean_log_visit = mean(log1p(visitors))) %>%
left_join(bar, by = c("air_genre_name","air_area_name")) %>%
group_by(n) %>%
summarise(mean_mlv = mean(mean_log_visit),
sd_mlv = sd(mean_log_visit)) %>%
tidyr::replace_na(list(sd_mlv = 0)) %>%
ggplot(aes(n, mean_mlv)) +
geom_point(color = "blue", size = 4) +
geom_errorbar(aes(ymin = mean_mlv - sd_mlv, ymax = mean_mlv + sd_mlv), width = 0.5, size = 0.7, color = "blue") +
labs(x="Case of identifcation Air genres per area", y = "Mean +/- SD of \n mean Logp1 visitors")foo <- air_reserve %>%
mutate(visit_date = date(visit_datetime)) %>%
group_by(air_store_id, visit_date) %>%
summarise(reserve_visitors_air = sum(reserve_visitors))
bar <- hpg_reserve %>%
mutate(visit_date = date(visit_datetime)) %>%
group_by(hpg_store_id, visit_date) %>%
summarise(reserve_visitors_hpg = sum(reserve_visitors)) %>%
inner_join(store_id_relation, by = "hpg_store_id")
all_reserve <- air_visit %>%
inner_join(foo, by = c("air_store_id", "visit_date")) %>%
inner_join(bar, by = c("air_store_id", "visit_date")) %>%
mutate(reserve_visitors = reserve_visitors_air + reserve_visitors_hpg)
p <- all_reserve %>%
filter(reserve_visitors < 120 ) %>%
ggplot(aes(reserve_visitors, visitors )) +
geom_point(color = "black", alpha = 0.5) +
geom_abline(slope = 1, intercept = 0, color = "grey60") +
geom_smooth(method = "lm", color = "blue")
ggMarginal(p, type = "histogram", fill = "blue", bins=50)p1 <- all_reserve %>%
ggplot(aes(visitors - reserve_visitors)) +
geom_histogram(binwith = 5 , fill = "black") +
coord_flip() +
labs(x = "")## Warning: Ignoring unknown parameters: binwith
p2 <- all_reserve %>%
ggplot(aes(visitors - reserve_visitors_air)) +
geom_histogram(binwidth = 5, fill = "blue") +
coord_flip() +
labs(x = "")
p3 <- all_reserve %>%
ggplot(aes(visitors - reserve_visitors_hpg)) +
geom_histogram(binwidth = 5, fill = "red") +
coord_flip() +
labs(x = "")
p4 <- all_reserve %>%
ggplot(aes(visit_date, visitors - reserve_visitors)) +
geom_hline(yintercept = c(150, 0, -250)) +
geom_line() +
geom_line(aes(visit_date, visitors - reserve_visitors_air +150), color = "blue") +
geom_line(aes(visit_date, visitors - reserve_visitors_hpg - 250), color = "red") +
ggtitle("Visitors - Reserved: all (black), air (blue), hpg (red)")
layout <- matrix(c(4,4,2,4,4,1,4,4,3), 3, 3, byrow = TRUE)
multiplot(p1, p2, p3, p4, layout = layout)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
all_reserve %>%
mutate(date = visit_date) %>%
left_join(date_info, by = "date") %>%
ggplot(aes(visitors - reserve_visitors, fill = holiday_flg)) +
geom_density(alpha = 0.5)air_visit <- air_visit %>%
mutate(wday = wday(visit_date, label = TRUE),
wday = forcats::fct_relevel(wday, c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")), month = month(visit_date, label = TRUE))
air_reserve <- air_reserve %>%
mutate(reserve_date = date(reserve_datetime),
reserve_hour = hour(reserve_datetime),
reserve_wday = wday(reserve_datetime, label = TRUE),
reserve_wday = forcats::fct_relevel(reserve_wday, c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")),
visit_date = date(visit_datetime),
visit_hour = hour(visit_datetime),
visit_wday = wday(visit_datetime, label = TRUE),
viist_wday = forcats::fct_relevel(visit_wday, c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")),
diff_hour = time_length(visit_datetime - reserve_datetime, unit = "hour"),
diff_day = time_length(visit_datetime - reserve_datetime, unit = "day"))
hpg_reserve %>%
mutate(reserve_date = date(reserve_datetime),
reserve_hour = hour(reserve_datetime),
reserve_wday = wday(reserve_datetime, label = TRUE),
reserve_wday = forcats::fct_relevel(reserve_wday, c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")),
visit_date = date(visit_datetime),
visit_hour = hour(visit_datetime),
visit_wday = wday(visit_datetime, label = TRUE),
visit_wday = forcats::fct_relevel(visit_wday, c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")),
diff_hour = time_length(visit_datetime - reserve_datetime, unit = "hour"),
diff_day = time_length(visit_datetime - reserve_datetime, unit = "day"))## # A tibble: 2,000,320 x 12
## hpg_store_id visit_datetime reserve_datetime reserve_visitors
## <chr> <dttm> <dttm> <int>
## 1 hpg_c63f6f4… 2016-01-01 11:00:00 2016-01-01 09:00:00 1
## 2 hpg_dac7278… 2016-01-01 13:00:00 2016-01-01 06:00:00 3
## 3 hpg_c8e24dc… 2016-01-01 16:00:00 2016-01-01 14:00:00 2
## 4 hpg_24bb207… 2016-01-01 17:00:00 2016-01-01 11:00:00 5
## 5 hpg_25291c5… 2016-01-01 17:00:00 2016-01-01 03:00:00 13
## 6 hpg_28bdf7a… 2016-01-01 17:00:00 2016-01-01 15:00:00 2
## 7 hpg_2a01a04… 2016-01-01 17:00:00 2016-01-01 17:00:00 2
## 8 hpg_2a84dd9… 2016-01-01 17:00:00 2016-01-01 15:00:00 2
## 9 hpg_2ad1798… 2016-01-01 17:00:00 2016-01-01 13:00:00 2
## 10 hpg_2c1d989… 2016-01-01 17:00:00 2016-01-01 15:00:00 6
## # … with 2,000,310 more rows, and 8 more variables: reserve_date <date>,
## # reserve_hour <int>, reserve_wday <ord>, visit_date <date>,
## # visit_hour <int>, visit_wday <ord>, diff_hour <dbl>, diff_day <dbl>
# count stores in area
air_count <- air_store_info %>%
group_by(air_area_name) %>%
summarise(air_count = n())
hpg_count <- hpg_store_info %>%
group_by(hpg_area_name) %>%
summarise(hpg_count = n())
# distance
med_coord_air <- air_store_info %>%
summarise_at(vars(longitude:latitude), median)
med_coord_hpg <- hpg_store_info %>%
summarise_at(vars(longitude:latitude), median)
air_coords <- air_store_info %>%
select(longitude, latitude)
hpg_coords <- hpg_store_info %>%
select(longitude, latitude)
air_store_info$dist <- geosphere::distCosine(air_coords, med_coord_air) / 1e3
hpg_store_info$dist <- geosphere::distCosine(hpg_coords, med_coord_hpg) / 1e3
# apply counts, dist; add prefecture
air_store_info %>%
mutate(dist_group = as.integer(case_when(
dist < 80 ~ 1,
dist < 300 ~ 2,
dist < 500 ~ 3,
dist < 750 ~ 4,
TRUE ~ 5 ))) %>%
left_join(air_count, by = "air_area_name") %>%
separate(air_area_name, c("preferture"), sep = " ", remove = FALSE)## Warning: Expected 1 pieces. Additional pieces discarded in 829 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## # A tibble: 829 x 9
## air_store_id air_genre_name air_area_name preferture latitude longitude
## <chr> <fct> <fct> <chr> <dbl> <dbl>
## 1 air_0f0cdee… Italian/French Hyōgo-ken Kō… Hyōgo-ken 34.7 135.
## 2 air_7cc17a3… Italian/French Hyōgo-ken Kō… Hyōgo-ken 34.7 135.
## 3 air_fee8dcf… Italian/French Hyōgo-ken Kō… Hyōgo-ken 34.7 135.
## 4 air_a17f077… Italian/French Hyōgo-ken Kō… Hyōgo-ken 34.7 135.
## 5 air_83db5af… Italian/French Tōkyō-to Min… Tōkyō-to 35.7 140.
## 6 air_99c3eae… Italian/French Tōkyō-to Min… Tōkyō-to 35.7 140.
## 7 air_f183a51… Italian/French Tōkyō-to Min… Tōkyō-to 35.7 140.
## 8 air_6b9fa44… Italian/French Tōkyō-to Min… Tōkyō-to 35.7 140.
## 9 air_0919d54… Italian/French Tōkyō-to Min… Tōkyō-to 35.7 140.
## 10 air_2c6c79d… Italian/French Tōkyō-to Min… Tōkyō-to 35.7 140.
## # … with 819 more rows, and 3 more variables: dist <dbl>,
## # dist_group <int>, air_count <int>
air_visit %>%
group_by(wday) %>%
summarise(mean_log_visitors = mean(log1p(visitors)),
sd_log_visitors = sd(log1p(visitors))) %>%
ggplot(aes(wday, mean_log_visitors, color = wday)) +
geom_point(size = 4) +
geom_errorbar(aes(ymin = mean_log_visitors - sd_log_visitors,
ymax = mean_log_visitors + sd_log_visitors,
color = wday), width = 0.5, size = 0.7 ) +
theme(legend.position = "none")p2 <- air_visit %>%
mutate(visitors = log1p(visitors)) %>%
ggplot(aes(visitors, wday, fill = wday)) +
ggridges::geom_density_ridges(bandwidth = 0.1) +
scale_x_log10() +
theme(legend.position = "none") +
labs(x= "log1P(visitors", y ="")
p3 <- air_visit %>%
group_by(month) %>%
summarise(mean_log_visitors = mean(log1p(visitors)),
sd_log_visitors = sd(log1p(visitors))) %>%
ggplot(aes(month, mean_log_visitors, color = month)) +
geom_point(size = 4) +
geom_errorbar(aes(ymin = mean_log_visitors - sd_log_visitors,
ymax = mean_log_visitors + sd_log_visitors,
color = month), width = 0.5, size = 0.7) +
theme(legend.position = "none")
p4 <- air_visit %>%
mutate(visitors = log1p(visitors)) %>%
ggplot(aes(visitors, month, fill = month)) +
geom_density_ridges(bandwidth = 0.1) +
scale_x_log10() +
theme(legend.position = "none") +
labs(x = "log1p(visitors)", y = "")
layout <- matrix(c(1,2,3,4),2,2,byrow=TRUE)
multiplot(p1, p2, p3, p4, layout=layout)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
air_visit %>%
left_join(air_store_info, by = "air_store_id") %>%
group_by(air_store_id, air_genre_name) %>%
summarise(mean_log_visits = mean(log1p(visitors)),
mean_log_visits = mean(log1p(visitors)),
sd_log_visits = sd(log1p(visitors))) %>%
ungroup()## # A tibble: 829 x 4
## air_store_id air_genre_name mean_log_visits sd_log_visits
## <chr> <fct> <dbl> <dbl>
## 1 air_00a91d42b08b08d9 Italian/French 3.17 0.577
## 2 air_0164b9927d20bcc3 Italian/French 2.12 0.685
## 3 air_0241aa3964b7f861 Izakaya 2.24 0.565
## 4 air_0328696196e46f18 Dining bar 1.97 0.663
## 5 air_034a3d5b40d5b1b1 Cafe/Sweets 2.46 0.824
## 6 air_036d4f1ee7285390 Cafe/Sweets 3.02 0.535
## 7 air_0382c794b73b51ad Cafe/Sweets 3.01 0.767
## 8 air_03963426c9312048 Izakaya 3.47 0.762
## 9 air_04341b588bde96cd Izakaya 3.49 0.543
## 10 air_049f6d5b402a31b2 Japanese food 2.37 0.609
## # … with 819 more rows
params_ts1 <- function(rownr){
bar <- air_visit %>%
filter(air_store_id == foo$air_store_id[rownr])
slope <- summary(lm(visitors ~ visit_date, data = bar))$coef[2]
slope_err <- summary(lm(visitors ~ visit_date, data = bar))$coef[4]
foobar <- tibble(
air_store_id = foo$air_store_id[rownr],
slope = slope,
slope_err = slope_err
)
return(foobar)
}
params <- params_ts1(1)
for (i in seq(2,nrow(foo))){
params <- bind_rows(params, params_ts1(i))
}
ts_params <- foo %>%
left_join(params, by = "air_store_id")